package com.shengzai.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demo of `RDD.groupBy`: reads a text file of student records and groups the
 * lines by their last comma-separated field.
 */
object Demo7GroupBy {

  /**
   * Creates a SparkContext with master "local", groups the lines of the students
   * file by their last comma-separated field, prints every group, and stops the
   * context afterwards.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setMaster("local")
    // NOTE(review): the app name is "Filter" although this demo performs a groupBy —
    // presumably carried over from another demo; left unchanged on purpose.
    conf.setAppName("Filter")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails, so it is not left running.
    try {
      // Relative path: resolved against the working directory of the process.
      val stuRDD: RDD[String] = sc.textFile("hadoop_code/src/data/students.txt")

      // Group by class (per the original author's note, the last comma-separated
      // field is the class); each result is a tuple of (key, lines having that key).
      // groupBy triggers a shuffle, which can move a large amount of data.
      // `last` throws if the split result is empty (e.g. a line made only of commas).
      val groupByRDD: RDD[(String, Iterable[String])] = stuRDD.groupBy(
        line => {
          line.split(",").last
        }
      )

      groupByRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }

}
